{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第06课：动手实战基于 ML 的中文短文本分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import jieba\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#加载停用词\n",
    "stopwords=pd.read_csv('stopwords.txt',index_col=False,quoting=3,sep=\"\\t\",names=['stopword'], encoding='utf-8')\n",
    "stopwords=stopwords['stopword'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['!', '\"', '#', ..., '450', '22549', '22544'], dtype=object)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#加载语料\n",
    "laogong_df = pd.read_csv('../data/06/beilaogongda.csv', encoding='utf-8', sep=',')\n",
    "laopo_df = pd.read_csv('../data/06/beilaogongda.csv', encoding='utf-8', sep=',')\n",
    "erzi_df = pd.read_csv('../data/06/beierzida.csv', encoding='utf-8', sep=',')\n",
    "nver_df = pd.read_csv('../data/06/beinverda.csv', encoding='utf-8', sep=',')\n",
    "#删除语料的nan行\n",
    "laogong_df.dropna(inplace=True)\n",
    "laopo_df.dropna(inplace=True)\n",
    "erzi_df.dropna(inplace=True)\n",
    "nver_df.dropna(inplace=True)\n",
    "#转换\n",
    "laogong = laogong_df.segment.values.tolist()\n",
    "laopo = laopo_df.segment.values.tolist()\n",
    "erzi = erzi_df.segment.values.tolist()\n",
    "nver = nver_df.segment.values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#定义分词和打标签函数preprocess_text\n",
    "#参数content_lines即为上面转换的list\n",
    "#参数sentences是定义的空list，用来储存打标签之后的数据\n",
    "#参数category 是类型标签\n",
    "def preprocess_text(content_lines, sentences, category):\n",
    "    for line in content_lines:\n",
    "        try:\n",
    "            segs=jieba.lcut(line)\n",
    "            segs = [v for v in segs if not str(v).isdigit()]#去数字\n",
    "            segs = list(filter(lambda x:x.strip(), segs))   #去左右空格\n",
    "            segs = list(filter(lambda x:len(x)>1, segs)) #长度为1的字符\n",
    "            segs = list(filter(lambda x:x not in stopwords, segs)) #去掉停用词\n",
    "            sentences.append((\" \".join(segs), category))# 打标签\n",
    "        except Exception:\n",
    "            print(line)\n",
    "            continue "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/yw/k7z_d_3567g16ss9plk47x9w0000gn/T/jieba.cache\n",
      "Loading model cost 0.661 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "sentences = []\n",
    "preprocess_text(laogong, sentences,0)\n",
    "preprocess_text(laopo, sentences, 1)\n",
    "preprocess_text(erzi, sentences, 2)\n",
    "preprocess_text(nver, sentences, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "random.shuffle(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "老公 人伤 通知 救护车 致电 持械 情况不明 民警 携带 防护 装备 1\n",
      "报警 女儿 无需 民警 到场 3\n",
      "报警 人称 女儿 持械 人伤 情况 不详 民警 到场 3\n",
      "报警 老公 民警 到场 0\n",
      "报警 丈夫 持械 人伤 无需 救护 民警 到场 民警 防护 设备 1\n",
      "报警 老公 棍子 人伤 民警 携带 防护 装备 到场 1\n",
      "报警 人称 儿子 民警 到场 2\n",
      "报警 女儿 人伤 民警 到场 携带 防护 装备 3\n",
      "报警 老公 民警 到场 1\n",
      "报警 儿子 民警 到场 2\n"
     ]
    }
   ],
   "source": [
    "for sentence in sentences[:10]:\n",
    "        print(sentence[0], sentence[1])  #下标0是词列表，1是标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vec = CountVectorizer(\n",
    "    analyzer='word', # tokenise by character ngrams\n",
    "    max_features=4000,  # keep the most common 1000 ngrams\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "x, y = zip(*sentences)\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=4000, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "        tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec.fit(x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(vec.transform(x_train), y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6357308584686775\n"
     ]
    }
   ],
   "source": [
    "print(classifier.score(vec.transform(x_test), y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pre = classifier.predict(vec.transform(x_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.568445475638051\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vec = CountVectorizer(\n",
    "    analyzer='word', # tokenise by character ngrams\n",
    "    ngram_range=(1,4),  # use ngrams of size 1 and 2\n",
    "    max_features=20000,  # keep the most common 1000 ngrams\n",
    ")\n",
    "vec.fit(x_train)\n",
    "#用朴素贝叶斯算法进行模型训练\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(vec.transform(x_train), y_train)\n",
    "#对结果进行评分\n",
    "print(classifier.score(vec.transform(x_test), y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5522041763341067\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "svm = SVC(kernel='linear')\n",
    "svm.fit(vec.transform(x_train), y_train)\n",
    "print(svm.score(vec.transform(x_test), y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "ename": "UnboundLocalError",
     "evalue": "local variable 'lib' referenced before assignment",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mUnboundLocalError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-20-0ee9a9f4ac0b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mxgboost\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/xgboost/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDMatrix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBooster\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrabit\u001b[0m                   \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m    134\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    135\u001b[0m \u001b[0;31m# load the XGBoost library globally\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 136\u001b[0;31m \u001b[0m_LIB\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_load_lib\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    138\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m_load_lib\u001b[0;34m()\u001b[0m\n\u001b[1;32m    126\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    127\u001b[0m             \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m     \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXGBGetLastError\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrestype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_char_p\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    129\u001b[0m     \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_log_callback_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    130\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXGBRegisterLogCallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'lib' referenced before assignment"
     ],
     "output_type": "error"
    }
   ],
   "source": [
    "import xgboost as xgb  \n",
    "from sklearn.model_selection import StratifiedKFold  \n",
    "import numpy as np\n",
    "# xgb矩阵赋值  \n",
    "xgb_train = xgb.DMatrix(vec.transform(x_train), label=y_train)  \n",
    "xgb_test = xgb.DMatrix(vec.transform(x_test)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
